視覺化在探索性資料分析中佔有舉足輕重的地位,因為對人類來說瞭解原始數列資料或者函數是極其困難的。
# Draw 1,000 standard-normal samples and show the first ten of them.
v <- rnorm(n = 1000)
print(head(v, n = 10))
[1] -1.77235202 0.11606663 1.19037167 0.23963170 -1.90994644 -1.18449108 [7] 0.14373728 1.57955591 -0.06194262 -1.93744960
# Histogram of the values in v.
hist(v)
# Evaluate the logistic (sigmoid) function on 100 evenly spaced points
# between -10 and 10 and draw it as a line.
x <- seq(-10, 10, length.out = 100)
f <- 1 / (1 + exp(-x))
plot(x = x, y = f, type = "l")
gapminder 範例資料集¶gapminder 套件 gapminder 套件gapminder 範例資料集的外觀¶library("gapminder")
# Print the number of rows and columns of the gapminder data frame.
print(dim(gapminder))
[1] 1704 6
# Preview the first three rows.
head(gapminder, 3)
| country | continent | year | lifeExp | pop | gdpPercap |
|---|---|---|---|---|---|
| <fct> | <fct> | <int> | <dbl> | <int> | <dbl> |
| Afghanistan | Asia | 1952 | 28.801 | 8425333 | 779.4453 |
| Afghanistan | Asia | 1957 | 30.332 | 9240934 | 820.8530 |
| Afghanistan | Asia | 1962 | 31.997 | 10267083 | 853.1007 |
# Preview the first three rows.
head(gapminder, 3)
| country | continent | year | lifeExp | pop | gdpPercap |
|---|---|---|---|---|---|
| <fct> | <fct> | <int> | <dbl> | <int> | <dbl> |
| Afghanistan | Asia | 1952 | 28.801 | 8425333 | 779.4453 |
| Afghanistan | Asia | 1957 | 30.332 | 9240934 | 820.8530 |
| Afghanistan | Asia | 1962 | 31.997 | 10267083 | 853.1007 |
gapminder 範例資料集有幾個國家?幾個洲別?¶print(length(unique(gapminder$country)))
# The call above prints the number of distinct countries; this one lists the
# distinct continent values.
print(unique(gapminder$continent))
[1] 142 [1] Asia Europe Africa Americas Oceania Levels: Africa Americas Asia Europe Oceania
# Print the distinct values of the year column.
gapminder 範例資料集有哪些年份?¶print(unique(gapminder$year))
[1] 1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 2002 2007
ggplot2 基礎¶ggplot2?¶ggplot2 以簡潔、彈性和美觀輸出快速擄獲資料科學團隊的芳心;命名之中 gg 指的是 grammar of graphics,套件作者是 Hadley Wickham 與 Winston Chang,核心理念是利用正規而有結構的文法來探索資料。
ggplot2 圖形¶ggplot() 函數做資料映射。geom_() 函數調整圖形種類。+ 連結不同的函數,堆疊圖層。library("gapminder") # data
library("ggplot2") # plotting
suppressMessages(library("dplyr")) # data manipulations
使用 ggplot(aes(x, y)) + geom_point()
# Scatter plot: GDP per capita on the x axis, life expectancy on the y axis,
# one point per row of gapminder.
gapminder %>%
ggplot(aes(x = gdpPercap, y = lifeExp)) +
geom_point()
使用 ggplot(aes(x)) + geom_bar()
# Bar chart of the number of 2007 rows per continent; geom_bar() counts the
# rows itself because only x is mapped.
gapminder %>%
filter(year == 2007) %>%
ggplot(aes(x = continent)) +
geom_bar()
使用 ggplot(aes(x, y)) + geom_bar(stat = "identity")
# Bar chart of total 2007 population per continent; stat = "identity" plots the
# pre-computed ttl_pop values as bar heights instead of counting rows.
gapminder %>%
filter(year == 2007) %>%
# pop is converted to double before summing -- presumably so the continent
# totals cannot overflow R's 32-bit integer range; confirm.
mutate(pop_numeric = as.numeric(pop)) %>%
group_by(continent) %>%
summarise(ttl_pop = sum(pop_numeric)) %>%
ggplot(aes(x = continent, y = ttl_pop)) +
geom_bar(stat = "identity")
`summarise()` ungrouping output (override with `.groups` argument)
使用 ggplot(aes(x)) + geom_histogram()
# Histogram of GDP per capita over all rows, split into 40 bins.
gapminder %>%
ggplot(aes(x = gdpPercap)) +
geom_histogram(bins = 40)
使用 ggplot(aes(x, y)) + geom_line()
# Line chart of GDP per capita over the years for Taiwan, Japan and China,
# one coloured line per country.
gapminder %>%
filter(country %in% c("Taiwan", "Japan", "China")) %>%
ggplot(aes(x = year, y = gdpPercap, color = country)) +
geom_line()
使用 ggplot(aes(x, y)) + geom_boxplot()
# Box plot of GDP per capita for each continent, outlined in a per-continent
# colour.
gapminder %>%
ggplot(aes(x = continent, y = gdpPercap, color = continent)) +
geom_boxplot()
ggplot2 技巧¶使用 ggtitle() + xlab() + ylab() 函數。
# Same GDP-vs-life-expectancy scatter plot, now with a title and explicit
# axis labels.
gapminder %>%
ggplot(aes(x = gdpPercap, y = lifeExp)) +
geom_point() +
ggtitle("Wealth vs. Health") +
xlab("GDP Per Capita") +
ylab("Life Expectancy")
使用 geom_text() 函數。
# Count the rows per continent, then draw the counts as bars with the number
# printed on each bar.
n_obs <- gapminder %>%
group_by(continent) %>%
summarise(nrows = n())
n_obs %>%
ggplot(aes(x = continent, y = nrows)) +
geom_bar(stat = "identity") +
# vjust = -1 shifts each label vertically relative to its anchor at y = nrows.
geom_text(aes(label = nrows, y = nrows), vjust = -1)
`summarise()` ungrouping output (override with `.groups` argument)
使用 theme(text = element_text(family = FONTS_SUPPORT_TC)) 函數。
# Scatter plot with Traditional Chinese title and axis labels, rendered in a
# font family chosen through theme().
p <- gapminder %>%
ggplot(aes(x = gdpPercap, y = lifeExp)) +
geom_point() +
ggtitle("財富與健康") +
xlab("人均 GDP") +
ylab("預期壽命") +
# NOTE(review): "Heiti TC Light" must exist on the rendering machine; it looks
# like a macOS system font -- confirm before running elsewhere.
theme(text = element_text(family = "Heiti TC Light"))
# Rendering is wrapped in suppressWarnings(), so any warning raised while
# drawing (for example about the font) is hidden.
suppressWarnings(print(p))
使用 scale_x_continuous() 與 scale_y_continuous() 函數調整座標軸上下界與量尺。
# First plot: restrict the x axis to 0-50000. na.rm = TRUE makes geom_point()
# drop missing values silently instead of warning about removed rows.
gapminder %>%
ggplot(aes(x = gdpPercap, y = lifeExp)) +
geom_point(na.rm = TRUE) +
scale_x_continuous(limits = c(0, 50000))
# Second plot: same data with a log10-transformed x axis.
gapminder %>%
ggplot(aes(x = gdpPercap, y = lifeExp)) +
geom_point() +
scale_x_continuous(trans = "log10")
使用 facet_wrap(vars(CATEGORICAL_COLUMN)) 函數。
# Scatter plot split into one panel per continent, with points coloured by
# continent as well.
gapminder %>%
ggplot(aes(x = gdpPercap, y = lifeExp, color = continent)) +
geom_point() +
facet_wrap(vars(continent))
ggplot2 視覺化真實世界資料¶
#' Download one JHU CSSE COVID-19 daily report as a data frame.
#'
#' @param report_date A single Date (or a value `as.Date()` can convert)
#'   selecting which day's report to fetch. Defaults to two days before the
#'   current date, which is what the function always used previously.
#' @return The data frame returned by `read.csv()` for that day's CSV file.
get_daily_report <- function(report_date = Sys.Date() - 2) {
  report_date <- as.Date(report_date)
  stopifnot(length(report_date) == 1L, !is.na(report_date))
  # Daily report files are named month-day-year, e.g. "09-02-2021.csv".
  file_date <- format(report_date, "%m-%d-%Y")
  csv_url <- paste0(
    "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/",
    "csse_covid_19_daily_reports/",
    file_date,
    ".csv"
  )
  # NOTE(review): the upstream repository appears to have stopped publishing
  # new daily reports in March 2023, so the default date may no longer
  # resolve -- confirm and pass an explicit report_date if so.
  tryCatch(
    read.csv(csv_url),
    error = function(e) {
      stop(
        "Could not read the daily report for ", file_date, " from ", csv_url,
        ": ", conditionMessage(e),
        call. = FALSE
      )
    }
  )
}
# Fetch the report with the function's default date and preview its first rows.
daily_report <- get_daily_report()
head(daily_report)
| FIPS | Admin2 | Province_State | Country_Region | Last_Update | Lat | Long_ | Confirmed | Deaths | Recovered | Active | Combined_Key | Incident_Rate | Case_Fatality_Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| <int> | <chr> | <chr> | <chr> | <chr> | <dbl> | <dbl> | <int> | <int> | <lgl> | <lgl> | <chr> | <dbl> | <dbl> | |
| 1 | NA | Afghanistan | 2021-09-02 04:21:15 | 33.93911 | 67.70995 | 153260 | 7123 | NA | NA | Afghanistan | 393.6977 | 4.647658 | ||
| 2 | NA | Albania | 2021-09-02 04:21:15 | 41.15330 | 20.16830 | 147369 | 2501 | NA | NA | Albania | 5120.8910 | 1.697100 | ||
| 3 | NA | Algeria | 2021-09-02 04:21:15 | 28.03390 | 1.65960 | 196527 | 5302 | NA | NA | Algeria | 448.1695 | 2.697848 | ||
| 4 | NA | Andorra | 2021-09-02 04:21:15 | 42.50630 | 1.52180 | 15046 | 130 | NA | NA | Andorra | 19473.2414 | 0.864017 | ||
| 5 | NA | Angola | 2021-09-02 04:21:15 | -11.20270 | 17.87390 | 47781 | 1227 | NA | NA | Angola | 145.3801 | 2.567966 | ||
| 6 | NA | Antigua and Barbuda | 2021-09-02 04:21:15 | 17.06080 | -61.79640 | 1719 | 44 | NA | NA | Antigua and Barbuda | 1755.3713 | 2.559628 |
# Sum the Confirmed column per Country_Region and sort the totals from
# largest to smallest.
confirmed_by_countries <- daily_report %>%
group_by(Country_Region) %>%
summarise(Confirmed = sum(Confirmed)) %>%
arrange(desc(Confirmed))
head(confirmed_by_countries)
`summarise()` ungrouping output (override with `.groups` argument)
| Country_Region | Confirmed |
|---|---|
| <chr> | <int> |
| US | 39396156 |
| India | 32857937 |
| Brazil | 20804215 |
| France | 6868151 |
| United Kingdom | 6856933 |
| Russia | 6838652 |
# Horizontal bar chart of the ten countries with the largest confirmed totals.
# The names are reversed before being used as factor levels -- presumably so
# that the largest total ends up at the top once coord_flip() is applied.
top_ten_countries <- rev(confirmed_by_countries$Country_Region[1:10])
p <- confirmed_by_countries %>%
head(10) %>%
mutate(Country_Region=factor(Country_Region, levels=top_ten_countries)) %>%
ggplot(aes(x = Country_Region, y = Confirmed)) +
geom_bar(stat = "identity") +
coord_flip()
p
# Horizontal bar chart of the ten countries with the largest confirmed totals,
# with each bar annotated by its count.
top_ten_countries <- rev(confirmed_by_countries$Country_Region[1:10])
top_ten_confirmed <- confirmed_by_countries$Confirmed[1:10]
p <- confirmed_by_countries %>%
  head(10) %>%
  mutate(Country_Region = factor(Country_Region, levels = top_ten_countries)) %>%
  ggplot(aes(x = Country_Region, y = Confirmed)) +
  geom_bar(stat = "identity") +
  # Label from the plotted data's own Confirmed column (y is inherited from
  # the plot mapping) so labels can never drift out of step with the bars.
  geom_text(aes(label = Confirmed), hjust = -0.1) +
  # Derive the upper limit from the data, with 25% headroom for the labels.
  # A fixed ceiling of 50,000,000 would remove any bar whose total exceeds it.
  scale_y_continuous(limits = c(0, max(top_ten_confirmed) * 1.25)) +
  coord_flip()
p
library("tidyr")
#' Download the JHU CSSE global confirmed-case time series in long format.
#'
#' The wide CSV (one column per day) is reshaped to one row per country and
#' day, and rows that share a country and day are summed together.
#'
#' @return A tibble grouped by `Country.Region` with columns `Country.Region`,
#'   `Date` (class Date) and `Confirmed`, ordered by country and then
#'   chronologically by date.
get_time_series_confirmed <- function() {
  csv_url <- paste0("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/",
                    "csse_covid_19_data/csse_covid_19_time_series/",
                    "time_series_covid19_confirmed_global.csv")
  time_series_confirmed <- read.csv(csv_url)
  # Columns from the fifth onwards hold the daily counts; read.csv() has turned
  # their headers into names such as "X1.22.20".
  date_cols <- colnames(time_series_confirmed)[5:ncol(time_series_confirmed)]
  time_series_confirmed[, c("Country.Region", date_cols)] %>%
    pivot_longer(
      cols = all_of(date_cols),
      names_to = "Date",
      values_to = "Confirmed"
    ) %>%
    # Parse the dates BEFORE grouping: summarise() orders its output by the
    # grouping keys, and character dates would sort as text ("1.1.21" before
    # "1.22.20") instead of chronologically.
    mutate(Date = as.Date(Date, format = "X%m.%d.%y")) %>%
    group_by(Country.Region, Date) %>%
    # "drop_last" keeps the result grouped by Country.Region, as before, and
    # silences the regrouping message.
    summarise(Confirmed = sum(Confirmed), .groups = "drop_last")
}
# Download the long-format time series and print it.
time_series_confirmed <- get_time_series_confirmed()
time_series_confirmed
`summarise()` regrouping output by 'Country.Region' (override with `.groups` argument)
| Country.Region | Date | Confirmed |
|---|---|---|
| <chr> | <date> | <int> |
| Afghanistan | 2021-01-01 | 51526 |
| Afghanistan | 2021-01-10 | 53489 |
| Afghanistan | 2021-01-11 | 53538 |
| Afghanistan | 2021-01-12 | 53584 |
| Afghanistan | 2021-01-13 | 53584 |
| Afghanistan | 2021-01-14 | 53775 |
| Afghanistan | 2021-01-15 | 53831 |
| Afghanistan | 2021-01-16 | 53938 |
| Afghanistan | 2021-01-17 | 53984 |
| Afghanistan | 2021-01-18 | 54062 |
| Afghanistan | 2021-01-19 | 54141 |
| Afghanistan | 2021-01-02 | 51526 |
| Afghanistan | 2021-01-20 | 54278 |
| Afghanistan | 2021-01-21 | 54403 |
| Afghanistan | 2020-01-22 | 0 |
| Afghanistan | 2021-01-22 | 54483 |
| Afghanistan | 2020-01-23 | 0 |
| Afghanistan | 2021-01-23 | 54559 |
| Afghanistan | 2020-01-24 | 0 |
| Afghanistan | 2021-01-24 | 54595 |
| Afghanistan | 2020-01-25 | 0 |
| Afghanistan | 2021-01-25 | 54672 |
| Afghanistan | 2020-01-26 | 0 |
| Afghanistan | 2021-01-26 | 54750 |
| Afghanistan | 2020-01-27 | 0 |
| Afghanistan | 2021-01-27 | 54854 |
| Afghanistan | 2020-01-28 | 0 |
| Afghanistan | 2021-01-28 | 54891 |
| Afghanistan | 2020-01-29 | 0 |
| Afghanistan | 2021-01-29 | 54939 |
| ⋮ | ⋮ | ⋮ |
| Zimbabwe | 2020-09-10 | 7453 |
| Zimbabwe | 2020-09-11 | 7479 |
| Zimbabwe | 2020-09-12 | 7508 |
| Zimbabwe | 2020-09-13 | 7526 |
| Zimbabwe | 2020-09-14 | 7531 |
| Zimbabwe | 2020-09-15 | 7576 |
| Zimbabwe | 2020-09-16 | 7598 |
| Zimbabwe | 2020-09-17 | 7633 |
| Zimbabwe | 2020-09-18 | 7647 |
| Zimbabwe | 2020-09-19 | 7672 |
| Zimbabwe | 2020-09-02 | 6638 |
| Zimbabwe | 2021-09-02 | 125118 |
| Zimbabwe | 2020-09-20 | 7683 |
| Zimbabwe | 2020-09-21 | 7683 |
| Zimbabwe | 2020-09-22 | 7711 |
| Zimbabwe | 2020-09-23 | 7725 |
| Zimbabwe | 2020-09-24 | 7752 |
| Zimbabwe | 2020-09-25 | 7787 |
| Zimbabwe | 2020-09-26 | 7803 |
| Zimbabwe | 2020-09-27 | 7812 |
| Zimbabwe | 2020-09-28 | 7816 |
| Zimbabwe | 2020-09-29 | 7837 |
| Zimbabwe | 2020-09-03 | 6678 |
| Zimbabwe | 2020-09-30 | 7838 |
| Zimbabwe | 2020-09-04 | 6837 |
| Zimbabwe | 2020-09-05 | 6837 |
| Zimbabwe | 2020-09-06 | 6837 |
| Zimbabwe | 2020-09-07 | 7298 |
| Zimbabwe | 2020-09-08 | 7388 |
| Zimbabwe | 2020-09-09 | 7429 |
# Line chart of the Confirmed column over time for the "Taiwan*" rows only.
p <- time_series_confirmed %>%
filter(Country.Region == "Taiwan*") %>%
ggplot(aes(x = Date, y = Confirmed)) +
geom_line()
p
# Same chart for five countries/regions, one coloured line each.
p <- time_series_confirmed %>%
filter(Country.Region %in% c("Taiwan*", "China", "Japan", "Korea, South", "Singapore")) %>%
ggplot(aes(x = Date, y = Confirmed, colour = Country.Region)) +
geom_line()
p
# Keep only the "Taiwan*" rows in chronological order and derive the
# day-over-day change of the Confirmed column. Note that this overwrites
# time_series_confirmed with the Taiwan-only subset.
time_series_confirmed <- time_series_confirmed %>%
  filter(Country.Region == "Taiwan*") %>%
  arrange(Date)
# dplyr::lag() is namespaced on purpose: a bare lag() resolves to stats::lag()
# whenever dplyr is not attached or is masked, and that function shifts a
# series' time base rather than the values of a plain vector.
confirmed_lag <- dplyr::lag(time_series_confirmed$Confirmed)
# The first element has no previous day, so its increase is NA.
daily_increase <- time_series_confirmed$Confirmed - confirmed_lag
time_series_confirmed$Daily_Increase <- daily_increase
time_series_confirmed
| Country.Region | Date | Confirmed | Daily_Increase |
|---|---|---|---|
| <chr> | <date> | <int> | <int> |
| Taiwan* | 2020-01-22 | 1 | NA |
| Taiwan* | 2020-01-23 | 1 | 0 |
| Taiwan* | 2020-01-24 | 3 | 2 |
| Taiwan* | 2020-01-25 | 3 | 0 |
| Taiwan* | 2020-01-26 | 4 | 1 |
| Taiwan* | 2020-01-27 | 5 | 1 |
| Taiwan* | 2020-01-28 | 8 | 3 |
| Taiwan* | 2020-01-29 | 8 | 0 |
| Taiwan* | 2020-01-30 | 9 | 1 |
| Taiwan* | 2020-01-31 | 10 | 1 |
| Taiwan* | 2020-02-01 | 10 | 0 |
| Taiwan* | 2020-02-02 | 10 | 0 |
| Taiwan* | 2020-02-03 | 10 | 0 |
| Taiwan* | 2020-02-04 | 11 | 1 |
| Taiwan* | 2020-02-05 | 11 | 0 |
| Taiwan* | 2020-02-06 | 16 | 5 |
| Taiwan* | 2020-02-07 | 16 | 0 |
| Taiwan* | 2020-02-08 | 17 | 1 |
| Taiwan* | 2020-02-09 | 18 | 1 |
| Taiwan* | 2020-02-10 | 18 | 0 |
| Taiwan* | 2020-02-11 | 18 | 0 |
| Taiwan* | 2020-02-12 | 18 | 0 |
| Taiwan* | 2020-02-13 | 18 | 0 |
| Taiwan* | 2020-02-14 | 18 | 0 |
| Taiwan* | 2020-02-15 | 18 | 0 |
| Taiwan* | 2020-02-16 | 20 | 2 |
| Taiwan* | 2020-02-17 | 22 | 2 |
| Taiwan* | 2020-02-18 | 22 | 0 |
| Taiwan* | 2020-02-19 | 23 | 1 |
| Taiwan* | 2020-02-20 | 24 | 1 |
| ⋮ | ⋮ | ⋮ | ⋮ |
| Taiwan* | 2021-08-04 | 15742 | 21 |
| Taiwan* | 2021-08-05 | 15753 | 11 |
| Taiwan* | 2021-08-06 | 15765 | 12 |
| Taiwan* | 2021-08-07 | 15775 | 10 |
| Taiwan* | 2021-08-08 | 15782 | 7 |
| Taiwan* | 2021-08-09 | 15790 | 8 |
| Taiwan* | 2021-08-10 | 15798 | 8 |
| Taiwan* | 2021-08-11 | 15814 | 16 |
| Taiwan* | 2021-08-12 | 15820 | 6 |
| Taiwan* | 2021-08-13 | 15836 | 16 |
| Taiwan* | 2021-08-14 | 15843 | 7 |
| Taiwan* | 2021-08-15 | 15852 | 9 |
| Taiwan* | 2021-08-16 | 15862 | 10 |
| Taiwan* | 2021-08-17 | 15880 | 18 |
| Taiwan* | 2021-08-18 | 15891 | 11 |
| Taiwan* | 2021-08-19 | 15897 | 6 |
| Taiwan* | 2021-08-20 | 15906 | 9 |
| Taiwan* | 2021-08-21 | 15916 | 10 |
| Taiwan* | 2021-08-22 | 15926 | 10 |
| Taiwan* | 2021-08-23 | 15932 | 6 |
| Taiwan* | 2021-08-24 | 15938 | 6 |
| Taiwan* | 2021-08-25 | 15939 | 1 |
| Taiwan* | 2021-08-26 | 15947 | 8 |
| Taiwan* | 2021-08-27 | 15954 | 7 |
| Taiwan* | 2021-08-28 | 15960 | 6 |
| Taiwan* | 2021-08-29 | 15983 | 23 |
| Taiwan* | 2021-08-30 | 15991 | 8 |
| Taiwan* | 2021-08-31 | 15995 | 4 |
| Taiwan* | 2021-09-01 | 16001 | 6 |
| Taiwan* | 2021-09-02 | 16006 | 5 |
# Bar chart of Daily_Increase per date; rows whose Daily_Increase is NA are
# filtered out before plotting.
p <- time_series_confirmed %>%
filter(!is.na(Daily_Increase)) %>%
ggplot(aes(x = Date, y = Daily_Increase)) +
geom_bar(stat = "identity", na.rm = TRUE)
p
plotly 複製一個 gapminder¶plotly 套件¶幫助 R 語言使用者不需要額外去學習 JavaScript 就能夠建立出互動性、具備 D3.js 及 WebGL 特性的圖表。
suppressMessages(library("plotly"))
# Animated bubble chart of gapminder built with plot_ly(): GDP per capita
# against life expectancy, marker size mapped to pop, colour to continent and
# one animation frame per year.
# radius is sqrt(pop / pi); its minimum and maximum are passed as `sizes` below.
radius <- sqrt((gapminder$pop)/pi)
# NOTE(review): fill = ~'' is presumably a workaround that silences a plotly
# warning when size is mapped -- confirm against the plotly documentation.
# hoverinfo = "text" together with text = ~country makes the hover label show
# the text field, i.e. the country.
p <- gapminder %>%
plot_ly(
x = ~gdpPercap,
y = ~lifeExp,
size = ~pop,
color = ~continent,
frame = ~year,
text = ~country,
fill = ~'',
hoverinfo = "text",
type = 'scatter',
mode = 'markers',
sizes = c(min(radius), max(radius))
) %>%
# Put the x axis on a logarithmic scale.
layout(
xaxis = list(
type = "log"
)
)
p
ggplot() 函數做資料映射。geom_() 函數調整圖形種類。+ 連結不同的函數,堆疊圖層。plotly 套件。